# import required libraries
import pandas as pd
import numpy as np
import json
import requests
import seaborn as sns
import folium
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from geopy.geocoders import Nominatim
# import k-means from clustering stage
from sklearn.cluster import KMeans
import folium
# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors
# Xgboost
import xgboost as xgb
from sklearn.metrics import explained_variance_score, mean_squared_error, r2_score
from numpy import loadtxt
from xgboost import XGBClassifier
from xgboost import plot_tree
# First, I'll use the Foursquare API to create clusters according to the most common venues in NYC
# !wget -q -O 'newyork_data.json' https://cocl.us/new_york_dataset
# Load the raw GeoJSON-style neighborhood data for NYC.
with open('newyork_data.json') as json_data:
    newyork_data = json.load(json_data)

# Build the neighborhoods dataframe from a list of records in one shot:
# DataFrame.append was deprecated and removed in pandas 2.0, and appending
# row-by-row is quadratic anyway.
records = []
for data in newyork_data['features']:
    coords = data['geometry']['coordinates']  # GeoJSON order is [lon, lat]
    records.append({'Borough': data['properties']['borough'],
                    'Neighborhood': data['properties']['name'],
                    'Latitude': coords[1],
                    'Longitude': coords[0]})
neighborhoods = pd.DataFrame(records,
                             columns=['Borough', 'Neighborhood', 'Latitude', 'Longitude'])

# download the data from http://insideairbnb.com/get-the-data.html
# city New York
# http://data.insideairbnb.com/united-states/ny/new-york-city/2019-12-04/data/listings.csv.gz
# and load it to a dataframe
df0 = pd.read_csv('./data/listings.csv', low_memory=False)
df0.head()
# Foursquare API configuration — '###' values are placeholders; real
# credentials must be supplied and kept out of version control.
CLIENT_ID = '###' # your Foursquare ID
CLIENT_SECRET = '###' # your Foursquare Secret
VERSION = '20180605' # Foursquare API version (date-stamped)
radius = 500 # search radius in meters around each neighborhood center
LIMIT = 50 # maximum number of venues returned per query
# function that extracts the category of the venue
def get_category_type(row):
    """Return the name of the first category of a Foursquare venue row.

    The categories may live under 'categories' (raw API item) or under
    'venue.categories' (flattened/normalized row). Returns None when the
    venue has no categories at all.
    """
    try:
        categories_list = row['categories']
    except KeyError:
        # narrow except: a bare `except:` would also hide real bugs
        categories_list = row['venue.categories']
    # an empty list means the venue is uncategorized
    if not categories_list:
        return None
    return categories_list[0]['name']
def getNearbyVenues(names, latitudes, longitudes, radius=500):
    """Query Foursquare's 'explore' endpoint for each neighborhood.

    names/latitudes/longitudes are parallel sequences, one entry per
    neighborhood. Returns a dataframe with one row per venue found within
    `radius` meters of each neighborhood center. Reads the module-level
    CLIENT_ID / CLIENT_SECRET / VERSION / LIMIT constants.
    """
    venues_list = []
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID,
            CLIENT_SECRET,
            VERSION,
            lat,
            lng,
            radius,
            LIMIT)
        # make the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']
        # keep only the relevant fields for each nearby venue; guard the
        # category lookup — a venue with an empty 'categories' list used to
        # raise IndexError and abort the entire download
        venues_list.append([(
            name,
            lat,
            lng,
            v['venue']['name'],
            v['venue']['location']['lat'],
            v['venue']['location']['lng'],
            v['venue']['categories'][0]['name'] if v['venue']['categories'] else None)
            for v in results])
    # flatten the per-neighborhood lists into a single dataframe
    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Neighborhood',
                             'Neighborhood Latitude',
                             'Neighborhood Longitude',
                             'Venue',
                             'Venue Latitude',
                             'Venue Longitude',
                             'Venue Category']
    return nearby_venues
print(neighborhoods.head())
print(len(neighborhoods))
# get the data from Foursquare
# run once to save API calls and save the data
#nyc_venues = getNearbyVenues(names=neighborhoods['Neighborhood'],
#                             latitudes=neighborhoods['Latitude'],
#                             longitudes=neighborhoods['Longitude']
#                             )
# load the cached venue results instead of re-querying the API
nyc_venues = pd.read_csv('./nyc_venues.csv')
# one hot encoding: one 0/1 column per venue category
nyc_onehot = pd.get_dummies(nyc_venues[['Venue Category']], prefix="", prefix_sep="")
# add neighborhood column back to dataframe
nyc_onehot['Neighborhood'] = nyc_venues['Neighborhood']
# move neighborhood column to the first column
fixed_columns = [nyc_onehot.columns[-1]] + list(nyc_onehot.columns[:-1])
nyc_onehot = nyc_onehot[fixed_columns]
nyc_onehot.head()
# per-neighborhood frequency of each venue category (mean of the 0/1 rows)
nyc_grouped = nyc_onehot.groupby('Neighborhood').mean().reset_index()
nyc_grouped
def return_most_common_venues(row, num_top_venues):
    """Return the names of the top `num_top_venues` venue categories for one
    neighborhood row (first element is the neighborhood label, skipped)."""
    ranked = row.iloc[1:].sort_values(ascending=False)
    return ranked.head(num_top_venues).index.values
num_top_venues = 10
# ordinal suffixes for the column labels: 1st, 2nd, 3rd, then 4th..10th
indicators = ['st', 'nd', 'rd']
# create columns according to number of top venues
columns = ['Neighborhood']
for ind in range(num_top_venues):
    try:
        columns.append('{}{} Most Common Venue'.format(ind + 1, indicators[ind]))
    except IndexError:
        # narrow except: ranks 4+ fall through to the generic 'th' suffix;
        # a bare `except:` here would also have masked real bugs
        columns.append('{}th Most Common Venue'.format(ind + 1))
# create a new dataframe: one row per neighborhood, venue categories ranked
# by frequency left to right
neighborhoods_venues_sorted = pd.DataFrame(columns=columns)
neighborhoods_venues_sorted['Neighborhood'] = nyc_grouped['Neighborhood']
for ind in range(nyc_grouped.shape[0]):
    neighborhoods_venues_sorted.iloc[ind, 1:] = return_most_common_venues(nyc_grouped.iloc[ind, :], num_top_venues)
neighborhoods_venues_sorted.head()
# set number of clusters
kclusters = 5
# drop the label column before clustering — passing `axis` positionally to
# drop() was deprecated and removed in pandas 2.0
nyc_grouped_clustering = nyc_grouped.drop(columns='Neighborhood')
# run k-means clustering on the venue-category frequency vectors
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(nyc_grouped_clustering)
# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:10]
# add clustering labels
neighborhoods_venues_sorted['Cluster Labels'] = kmeans.labels_
nyc_merged = neighborhoods
# join on neighborhood name to add latitude/longitude for each neighborhood
nyc_merged = nyc_merged.join(neighborhoods_venues_sorted.set_index('Neighborhood'), on='Neighborhood')
nyc_merged.head()  # check the last columns!
# drop rows whose neighborhood got no cluster label (NaN from the join)
nyc_merged = nyc_merged[nyc_merged['Cluster Labels'] > -1]
len(nyc_merged)
# generate the map of neighborhoods, colored by k-means cluster
# The geographical coordinates of New York City are 40.7127281, -74.0060152.
latitude = 40.7127281
longitude = -74.0060152
# create map centered on NYC
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11, tiles="OpenStreetMap")
# one fixed color per cluster. Named cluster_colors so it no longer shadows
# the matplotlib.colors module imported as `colors` at the top of the file.
cluster_colors = ['red', 'green', 'yellow', 'blue', 'black']
# add one circle marker per neighborhood
for lat, lon, poi, cluster in zip(nyc_merged['Latitude'], nyc_merged['Longitude'], nyc_merged['Neighborhood'], nyc_merged['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=cluster_colors[int(cluster)],
        fill=True,
        fill_color=cluster_colors[int(cluster)],
        fill_opacity=0.9).add_to(map_clusters)
map_clusters
# check the columns
list(df0.columns)
# I'll focus only in a subset of features:
# price is the prediction target; the rest are location, capacity, quality,
# fee and availability attributes of each listing
df1 = df0[['price',
           'latitude',
           'longitude',
           'neighbourhood_group_cleansed',
           'accommodates',
           'review_scores_rating',
           'amenities',
           'property_type',
           'room_type',
           'cleaning_fee',
           'host_listings_count',
           'availability_90',
           'extra_people',
           'number_of_reviews',
           'bathrooms',
           'bedrooms',
           'beds',
           'security_deposit',
           'cancellation_policy',
           'minimum_nights',
           'maximum_nights',
           'square_feet'
           ]]
df1.head(3)
df1['property_type'].value_counts()
# let's focus only on Apartments and Houses; take an explicit copy so the
# in-place cleaning below does not hit SettingWithCopyWarning / edit a view
df1 = df1[(df1['property_type'] == 'Apartment') | (df1['property_type'] == 'House')].copy()
# fill some nan values with the median
for col in ['bathrooms', 'bedrooms', 'beds']:
    df1[col].fillna(df1[col].median(), inplace=True)
# important amenities to take into account
list_of_amenities = ["Air conditioning","Kitchen","Free parking on premises","Gym","Elevator","Hot tub","Washer","Laptop friendly workspace","Private entrance"]
# one 0/1 indicator column per amenity, derived from the raw amenities string
# (the separate zero-initialization loop was redundant — every column is
# assigned unconditionally here)
for a in list_of_amenities:
    df1[a] = df1['amenities'].str.contains(a, regex=False) * 1
# sanity check
df1.tail()
# drop rows with missing price
df1.dropna(subset=['price'], inplace=True)
# strip '$' and ',' and cast price to int (raw strings for the regex avoid
# Python's invalid-escape-sequence warning)
df1['price'] = pd.to_numeric(df1['price'].replace(r'[\$,]', '', regex=True)).astype('int32')
# fill missing fee columns with 0
df1['security_deposit'].fillna(0, inplace=True)
df1['cleaning_fee'].fillna(0, inplace=True)
df1['extra_people'].fillna(0, inplace=True)
df1[['security_deposit','cleaning_fee','extra_people']]
# cast the fee columns to numeric as well
df1['security_deposit'] = pd.to_numeric(df1['security_deposit'].replace(r'[\$,]', '', regex=True)).astype('int32')
df1['cleaning_fee'] = pd.to_numeric(df1['cleaning_fee'].replace(r'[\$,]', '', regex=True)).astype('int32')
df1['extra_people'] = pd.to_numeric(df1['extra_people'].replace(r'[\$,]', '', regex=True)).astype('int32')
df1[['security_deposit','cleaning_fee','extra_people']]
# amenities have been converted to indicator columns; the raw column is no longer needed
df1.drop(['amenities'], axis=1, inplace=True)
# check sanity of price
min(df1['price'])
# there are some prices = 0 — get rid of them
df1 = df1[df1['price'] > 0]
# clip outliers: cap high prices at 1000 and floor low prices at 10
df1.loc[df1.price > 1000, 'price'] = 1000
# fixed: this previously set low prices to 1000 (clearly a typo) — low
# outliers belong at the floor, not the ceiling
df1.loc[df1.price < 10, 'price'] = 10
# group cancellation policy into a single flexible / not-flexible indicator
df1['cancellation_policy'].unique()
df1['cancellation_policy_flexible'] = df1['cancellation_policy'].str.contains('flexible', regex=False)*1
df1.drop(['cancellation_policy'], axis=1, inplace=True)
# function for finding the closest lat lon
# see https://stackoverflow.com/questions/41336756/find-the-closest-latitude-and-longitude
from math import cos, asin, sqrt

def distance(lat1, lon1, lat2, lon2):
    """Approximate great-circle distance in km between two (lat, lon) points
    (haversine formula in its cosine form)."""
    deg2rad = 0.017453292519943295  # pi / 180
    hav = 0.5 - cos((lat2 - lat1) * deg2rad) / 2 + cos(lat1 * deg2rad) * cos(lat2 * deg2rad) * (1 - cos((lon2 - lon1) * deg2rad)) / 2
    return 12742 * asin(sqrt(hav))  # 12742 km = Earth's diameter

def closest(data, v):
    """Return the record in `data` whose coordinates are nearest to point `v`
    (both are dicts with 'Latitude'/'Longitude' keys)."""
    return min(data, key=lambda rec: distance(v['Latitude'], v['Longitude'], rec['Latitude'], rec['Longitude']))
nyc_merged.columns
# create a column to be used as search target
# NOTE(review): lat/lon are concatenated with no separator, so distinct pairs
# could in principle collide; it works here because find_cluster rebuilds the
# key the same way, but add a delimiter before reusing this key elsewhere.
nyc_merged['lat_lon'] = nyc_merged['Latitude'].astype('str') + nyc_merged['Longitude'].astype('str')
nyc_merged.head(10)
# list of {'Latitude': ..., 'Longitude': ...} dicts for the nearest-point search
lldict = nyc_merged[['Latitude','Longitude']].to_dict(orient= 'records')
lldict[:10]
# function that finds the cluster given a row index
def find_cluster(i):
    """Return the venue-cluster label of the NYC neighborhood nearest to
    listing row `i` of df1 (positional index).

    Uses the module-level df1, lldict and nyc_merged objects.
    """
    target = {'Latitude': df1.iloc[i]['latitude'], 'Longitude': df1.iloc[i]['longitude']}
    nearest = closest(lldict, target)
    # rebuild the same no-separator key used to build nyc_merged['lat_lon']
    key = str(nearest['Latitude']) + str(nearest['Longitude'])
    # select by boolean mask: the original took an index *label* from the
    # filtered nyc_merged (whose index has gaps) and fed it to .iloc, which
    # is positional — that returned the wrong row or raised IndexError
    found = nyc_merged.loc[nyc_merged['lat_lon'] == key]
    return found.iloc[0]['Cluster Labels']
# create the cluster column — the nearest-neighborhood lookup takes a while,
# so it was computed once and the result cached:
# df1.to_csv('./df1.csv')
df1 = pd.read_csv('./df1.csv')
df1.head()  # see 'cluster' column
# generate a map of airbnb listings colored by cluster
# because this takes some time, I'm mapping only the first 3000 here
tempdf = df1.iloc[:3000]
# The geographical coordinates of New York City are 40.7127281, -74.0060152.
latitude = 40.7127281
longitude = -74.0060152
# create map
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11, tiles="OpenStreetMap")
# one fixed color per cluster. Renamed from `colors`, which shadowed the
# matplotlib.colors module imported at the top of the file.
cluster_colors = ['red', 'green', 'yellow', 'blue', 'black']
# add markers to the map
for lat, lon, poi, cluster in zip(tempdf['latitude'], tempdf['longitude'], tempdf['neighbourhood_group_cleansed'], tempdf['cluster']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    # 'cluster' is a string like 'cluster_3'; its last character is the
    # cluster index (only valid while kclusters <= 10)
    folium.CircleMarker(
        [lat, lon],
        radius=4,
        popup=label,
        color=cluster_colors[int(cluster[-1])],
        fill=True,
        fill_color=cluster_colors[int(cluster[-1])],
        fill_opacity=0.9).add_to(map_clusters)
map_clusters
df1['cluster'].value_counts()
# lat and long no longer needed once the cluster label is attached
df1.drop(['latitude', 'longitude'], axis=1, inplace=True)
# one-hot encode the remaining categorical columns
df2 = pd.get_dummies(df1)
df2.head()
df2.columns
# drop the index column that came along from the cached CSV
df2.drop(columns=['Unnamed: 0'], inplace=True)
# check the price distribution — histplot replaces distplot, which was
# deprecated and has been removed from recent seaborn releases
sns.histplot(df2['price'], kde=True)
# convert skewed columns to log scale
cols_to_log = ['price','accommodates','review_scores_rating','cleaning_fee',
               'host_listings_count',
               'availability_90',
               'extra_people',
               'number_of_reviews',
               'bathrooms',
               'security_deposit']
for col in cols_to_log:
    # replace exact zeros so the logarithm is defined
    df2[col] = df2[col].astype('float64').replace(0.0, 0.01)
    df2[col] = np.log(df2[col])
sns.histplot(df2['price'], kde=True)
# features (X) and target (y): predict log-price from everything else
X = df2.drop('price', axis=1)
y = df2.price
# Scaling to zero mean / unit variance
scaler = StandardScaler()
X = pd.DataFrame(scaler.fit_transform(X), columns=list(X.columns))
# Splitting into train and test sets (80/20, fixed seed for reproducibility)
from sklearn.model_selection import train_test_split, cross_val_score
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=123)
# fit a gradient-boosted regressor with default hyperparameters
xgb_reg = xgb.XGBRegressor()
xgb_reg.fit(X_train, y_train)
training_preds_xgb_reg = xgb_reg.predict(X_train)
val_preds_xgb_reg = xgb_reg.predict(X_test)
# Evaluate — note the metrics are on the log-price scale
print("\nTraining MSE:", round(mean_squared_error(y_train, training_preds_xgb_reg),4))
print("Validation MSE:", round(mean_squared_error(y_test, val_preds_xgb_reg),4))
print("\nTraining r2:", round(r2_score(y_train, training_preds_xgb_reg),4))
print("Validation r2:", round(r2_score(y_test, val_preds_xgb_reg),4))
# rank features by their importance in the fitted model
ft_weights_xgb_reg = pd.DataFrame(xgb_reg.feature_importances_, columns=['weight'], index=X_train.columns)
ft_weights_xgb_reg.sort_values('weight', ascending=False, inplace=True)
ft_weights_xgb_reg.head(10)
# Plotting the top-5 feature importances
plt.figure(figsize=(10,10))
plt.barh(ft_weights_xgb_reg.index[:5], ft_weights_xgb_reg.weight[:5], align='center')
plt.title("Feature importances in the XGBoost model", fontsize=14)
plt.xlabel("Feature importance")
plt.margins(y=0.01)
plt.show()
# Only entire homes and apartments
df1['room_type'].unique()
# keep only entire homes/apartments; copy explicitly so the in-place drop
# below does not emit SettingWithCopyWarning or mutate a view of df1
df1b = df1[df1['room_type'] == 'Entire home/apt'].copy()
# the column is now constant and carries no information
df1b.drop(['room_type'], axis=1, inplace=True)
df1b.head()
# Fit the model again on the entire-home subset only
df2 = pd.get_dummies(df1b)
# same log transform as before for the skewed columns
cols_to_log = ['price','accommodates','review_scores_rating','cleaning_fee',
               'host_listings_count',
               'availability_90',
               'extra_people',
               'number_of_reviews',
               'bathrooms',
               'security_deposit']
for col in cols_to_log:
    # replace exact zeros so the logarithm is defined
    df2[col] = df2[col].astype('float64').replace(0.0, 0.01)
    df2[col] = np.log(df2[col])
X = df2.drop('price', axis=1)
y = df2.price
scaler = StandardScaler()
X = pd.DataFrame(scaler.fit_transform(X), columns=list(X.columns))
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=123)
xgb_reg = xgb.XGBRegressor()
xgb_reg.fit(X_train, y_train)
training_preds_xgb_reg = xgb_reg.predict(X_train)
val_preds_xgb_reg = xgb_reg.predict(X_test)
# metrics are on the log-price scale
print("\nTraining MSE:", round(mean_squared_error(y_train, training_preds_xgb_reg),4))
print("Validation MSE:", round(mean_squared_error(y_test, val_preds_xgb_reg),4))
print("\nTraining r2:", round(r2_score(y_train, training_preds_xgb_reg),4))
print("Validation r2:", round(r2_score(y_test, val_preds_xgb_reg),4))
# visualize the boosted trees
fig, ax = plt.subplots(figsize=(40, 100))
plot_tree(xgb_reg, ax=ax)
plt.show()
# check feature importance of the refit model
ft_weights_xgb_reg = pd.DataFrame(xgb_reg.feature_importances_, columns=['weight'], index=X_train.columns)
ft_weights_xgb_reg.sort_values('weight', ascending=False, inplace=True)
ft_weights_xgb_reg.head(10)
plt.figure(figsize=(20, 20))
plt.barh(ft_weights_xgb_reg.index[:15], ft_weights_xgb_reg.weight[:15], align='center')
plt.title("Feature importances in the XGBoost model", fontsize=14)
plt.xlabel("Feature importance")
plt.margins(y=0.01)
plt.show()
# convert from natural logarithm back to the original price scale
# (vectorized np.exp replaces the per-element np.e**i list comprehensions)
predicted_prices = np.exp(val_preds_xgb_reg)
test_prices = np.exp(y_test.values)
plt.figure(figsize=(10, 10))
plt.ylim(2, 500)
plt.xlim(2, 500)
# seaborn >= 0.12 requires x/y as keywords; positional data args were removed
sns.scatterplot(x=predicted_prices, y=test_prices)
# the y = x reference line: points on it are perfect predictions
sns.lineplot(x=range(500), y=range(500), color='g')
# Discussion: violin plots of price against individual features
plt.figure(figsize=(10,10))
plt.ylim(0, 800)
# price distribution for 1- vs 2-bathroom listings
tempdf2 = df1b[(df1b['bathrooms'] == 1) | (df1b['bathrooms'] == 2)]
sns.violinplot(data=tempdf2, x='bathrooms', y='price')
# price distribution by borough
plt.figure(figsize=(10,10))
plt.ylim(0, 500)
tempdf2 = df1b
sns.violinplot(data=tempdf2, x='neighbourhood_group_cleansed', y='price')
# washing machine
plt.figure(figsize=(10,10))
plt.ylim(0, 500)
sns.violinplot(data=tempdf2, x='Washer', y='price')
# price distribution by venue cluster (clusters 1-3 only)
tempdf2 = df1b[(df1b['cluster'] == 'cluster_1') | (df1b['cluster'] == 'cluster_2') | (df1b['cluster'] == 'cluster_3')]
plt.figure(figsize=(10,10))
plt.ylim(0, 500)
sns.violinplot(data=tempdf2, x='cluster', y='price')